ScReNI tutorial

xueli xu

2024-09-09

Installation

Follow the steps below to install ScReNI package from GitHub and run it:

install.packages('devtools')
devtools::install_github('Xuxl2020/ScReNI')

# require
library(ScReNI)
# load requried packages
library(Seurat) # remotes::install_version("Seurat", "5.0.1", repos = c("https://satijalab.r-universe.dev", getOption("repos")))
library(SeuratObject) # remotes::install_version("SeuratObject", "5.0.1", repos = c("https://satijalab.r-universe.dev", getOption("repos")))
library(Signac) # 1.10.0
library(dplyr)
library(ggpubr)
library(monocle3)
library(ggplot2)
library(igraph)
library(Matrix) ###remotes::install_version("Matrix", version = "1.6-5")
library(harmony) ###remotes::install_version("harmony", version = "0.1.1")
library(sctransform) ###remotes::install_version("sctransform", version = "0.4.1")
library(GenomeInfoDb)
library(BSgenome.Mmusculus.UCSC.mm10)
library(EnsDb.Mmusculus.v79)
library(ComplexHeatmap)
library(IReNA)
library(patchwork)
library(future)
library(cowplot)

Introduction

ScReNI infers the regulatory network for each cell through integrating unpaired scRNA-seq and scATAC-seq data. This comprehensive tutorial is structured into five steps: (1) Clustering Integration; (2) Inference of Single Cell-Specific Networks; (3) Evaluation of Regulatory Networks; (4) Cell Clustering Based on Networks; (5) Identification of Cell-enriched Regulators.

Input Data

Seurat objects and intermediate files can be accessed and downloaded through the following link: Datasets Please ensure that the Seurat object is properly formatted and includes all necessary metadata for accurate analysis.

# Define data information
data.name = 'mmRetina_RPCMG'
species = 'mouse'
data.type = 'unpaired'
# Please set your path
path = 'data/'
scRNAseq <- readRDS(paste0(path, data.name, "_scRNAseq.rds"))  
scATACseq <- readRDS(paste0(path, data.name, "_scATACseq.rds"))

Step 1: perform clustering to integrate scRNA-seq and scATAC-seq data

In this Step, we’ll demonstrate how to jointly analyze a single-cell dataset measuring both DNA accessibility and gene expression in the same cells using Signac and Seurat. For more detailed content and introduction, please refer to the website (https://stuartlab.org/signac/articles/pbmc_multiomic)

# Step 1.1: integrate scRNA-seq and scATAC-seq
# Define integration parameters
IntegratedDimensions <- 20
KNN <- 20
AnchorsDim <- 50
# for unpaired scRNA-seq and scATAC-seq data
coembed <- Integrate_scRNA_scATAC(scRNAseq, scATACseq, AnchorsDim, IntegratedDimensions, KNN, data.type='unpaired')
# for paired scRNA-seq and scATAC-seq data
# coembed <- Integrate_scRNA_scATAC(scRNAseq, scATACseq, IntegratedDimensions, KNN, data.type='paired', species)
# Step 1.2: plot figures for the integrated scRNAseq and scATACseq
coembed <- readRDS('data/mmRetina_RPCMG_integrated_scRNA_scATAC_D30.rds')
groups=c('datatypes', 'samples', 'celltypes')
print(DimPlot(coembed, group.by = groups[1], reduction = "umap", pt.size = 0.8, label = F) 
      |DimPlot(coembed, group.by = groups[2], reduction = "umap", pt.size = 0.8, label = F)
      |DimPlot(coembed, group.by = groups[3], reduction = "umap", pt.size = 0.8, label = T))

# Step 1.3 only for unpaired data: get the nearest neighbor of each cell. However, for paired data, step 1.3 is not necessary. Instead, you can proceed directly to step 2 of the analysis pipeline.
scRNA_scATAC_neighbor <- Get_scRNA_scATAC_neighbors(coembed)
scRNA_scATAC_neighbor_dup <- scRNA_scATAC_neighbor[[1]]
scRNA_scATAC_neighbor_undup <- scRNA_scATAC_neighbor[[2]]

Step 2: infer single cell-specific networks

kScReNI infers cell-specific regulatory networks using transcriptomes alone. wScReNI infers cell-specific regulatory networks by integrating scRNA-seq and scATAC-seq data.

# set the parameters
cell.num = 100 # number of each cell type
ngenes = 500 # number of highly variable genes
npeaks = 10000 #  number of highly variable peaks 
result.path = 'results/' # please set the result path
network.path = paste0(result.path, 'NetworksC', cell.num, '/')  # the path to save networks
# Step 2.1: get partial cells to infer scNetworks
# partial cells
Partial_cell <- c("RPCs_S1", "RPCs_S2", "RPCs_S3", "MG")
# for unpaired data
scRNA.scATAC.cells <- Select_partial_cells_for_scNewtorks(coembed, Partial_cell, cell.num, data.type = "unpaired", scRNA_scATAC_neighbor_undup)
# for paired data
# scRNA.scATAC.cells <- Select_partial_cells_for_scNewtorks(coembed, Partial_cell, cell.num, data.type = "paired")
# Step 2.2: select top variable genes and peaks to infer scNetworks
sub.scrna.top <- select_features(sub.scrna, ngenes, datatype='RNA')
sub.scatac.top <- select_features(sub.scatac, npeaks, datatype='ATAC')
# Step 2.3: infer scNetworks using kScReNI
kScReNI.weights <- Infer_kScReNI_scNetworks(sub.scrna.top, ngenes, knn=KNN)

When employing wScReNI, it is essential to leverage reference datasets. These datasets can be conveniently accessed and downloaded via the following link: Reference dataset. They serve as an indispensable resource for deciphering the intricate connections between genes and peaks, a fundamental step in the construction of regulatory networks. However, if you have prior knowledge of the specific relationships between genes and peaks, you can proceed directly to Step 2.4.2 of the network construction phase.

# Step 2.4: infer scNetworks using wScReNI
# Step 2.4.1: identification of peak-associated genes
library(BSgenome.Mmusculus.UCSC.mm10) ### 1.4.3
genome_database = BSgenome.Mmusculus.UCSC.mm10
motif_database <- read.table("Tranfac201803_Mm_MotifTFsFinal.txt", header = T)
gtf_data <- as.data.frame(rtracklayer::import("mouse.genes.gtf"))
motif_pwm <- readRDS("all_motif_pwm.rds")

gene_peak_overlap <- Infer_gene_peak_relationships(gtf_data, 
                                                   sub.scrna.top, 
                                                   sub.scatac.top,
                                                   motif_database, 
                                                   motif_pwm, 
                                                   genome_database = genome_database)

gene_peak_overlap_matrix <- peakMat(sub.scatac.top, gene_peak_overlap)
gene_peak_overlap_labs <- peak_gene_TF_labs(gene_peak_overlap)
# Step 2.4.2: get the nearest neighbor of each cell
scRNA_scATAC_WNN <- Integrate_scRNA_scATAC(sub.scrna.top, sub.scatac.top, IntegratedDimensions=20, KNN, data.type='paired')
nearest.neighbors.idx <- scRNA_scATAC_WNN[['weighted.nn']]@nn.idx
rownames(nearest.neighbors.idx) <- scRNA_scATAC_WNN[['weighted.nn']]@cell.names
# Step 2.4.3: infer scNetworks using wScReNI
wScReNI.weights <- Infer_wScReNI_scNetworks(sub.scrna.top, gene_peak_overlap_matrix, gene_peak_overlap_labs, nearest.neighbors.idx, network.path, data.name, cell.index=1:400)
wScReNI.weights <- Combine_wScReNI_scNetworks(sub.scrna.top, network.path)

Step 3: perform cell clustering based on single cell-specific networks

For the purpose of network-based cell clustering, we calculated gene degrees derived from cell-specific networks. To evaluate the influence of the number of pairwise relationships between TFs and target genes, we applied diverse thresholds to the ranked weights.

# Step 4.1: Calculate the degrees
ncell = 400; ntype = 4; top = rep(500, ncell) # top 500 gene regulation pairs
celltype <- read.csv(paste0('data/', data.name, "_Cell", cell.num, "_annotation.csv"))
sub.celltype <- celltype$undup.cell.types
scNetworks_degree <- calculate_scNetwork_degree(scNetworks, top, sub.celltype, ntype, column_name='sub.celltype')
# Step 4.2: Plot cell clustering based on the networks
load('data/scNetworks_degree.rdata')
col = list(Cell_type = c("MG" = "red", "RPCs_S1" = "green", "RPCs_S2" = "blue", "RPCs_S3" = "purple"))
Network_types <- names(scNetworks_degree)
i=3
print(Network_types[i])
## [1] "kScReNI"
degree <- scNetworks_degree[[i]]
outdegree <- degree[['outdegree']]
out.degree <- degree[['out.degree.umap']]
# plot the heatmap based on the similarity of the degree matrix
column_ha = HeatmapAnnotation("Cell_type" = sub.celltype, col = col)
Heatmap(cor(log(outdegree+1)), 
                top_annotation = column_ha, 
                show_row_names = F, 
                show_column_names = F)

out.degree[['lab']] <- sub.celltype
DimPlot(out.degree, reduction = "umap", group.by = "lab", label = F)

i=5
print(Network_types[i])
## [1] "wScReNI"
degree <- scNetworks_degree[[i]]
outdegree <- degree[['outdegree']]
out.degree <- degree[['out.degree.umap']]
# plot the heatmap based on the similarity of the degree matrix
column_ha = HeatmapAnnotation("Cell_type" = sub.celltype, col = col)
Heatmap(cor(log(outdegree+1)), 
                top_annotation = column_ha, 
                show_row_names = F, 
                show_column_names = F)

out.degree[['lab']] <- sub.celltype
DimPlot(out.degree, reduction = "umap", group.by = "lab", label = F)

Step 4: identify regulators enriched in each single-cell regulatory network

We used Monocle3 to investigate pseudotime trajectories in scRNA-seq data. Cells were clustered via UMAP reduction, and the principal graph was determined using the learn_graph function. Any RPCs_S1 cell was set as the root when running order_cells.

# Step 4.1: perform trajectory analysis to calculate pseudotime
DefaultAssay(coembed) <- "RNA"
load('data/gene.use.rdata') 
seurat_object <- coembed[gene.use,]
expression_matrix0 <- as.matrix(seurat_object@assays$RNA@counts)
seurat.obj <- seurat_object[,Matrix::colSums(expression_matrix0) != 0]
expression_matrix <- expression_matrix0[,Matrix::colSums(expression_matrix0) != 0]
cell_metadata <- seurat_object@meta.data[Matrix::colSums(expression_matrix0) != 0,]
gene_annotation <- data.frame(gene_short_name=rownames(expression_matrix))
rownames(gene_annotation) <- rownames(expression_matrix)
# monocle3
cds <- new_cell_data_set(expression_matrix,
                         cell_metadata = cell_metadata,
                         gene_metadata = gene_annotation)

# NormalizeData+ScaleData+RunPCA
cds <- preprocess_cds(cds, num_dim = 30)
cds <- reduce_dimension(cds, preprocess_method = "PCA")

# umap
cds.embed <- cds@int_colData$reducedDims$UMAP
int.embed <- Embeddings(seurat.obj, reduction = "umap")
int.embed <- int.embed[rownames(cds.embed),]
cds@int_colData$reducedDims$UMAP <- int.embed
cds@clusters$UMAP$clusters <- seurat.obj@meta.data[rownames(colData(cds)), "seurat_clusters"]
cds@clusters$UMAP$partitions <- factor(x = rep(1, length(rownames(colData(cds)))), levels = 1)
names(cds@clusters$UMAP$partitions) <- rownames(colData(cds))

cds <- estimate_size_factors(cds)
cds@rowRanges@elementMetadata@listData$gene_short_name <- rownames(cds)
rownames(cds@principal_graph_aux[["UMAP"]]$dp_mst) <- NULL
colnames(cds@int_colData@listData$reducedDims@listData$UMAP) <- NULL
cds <- learn_graph(cds,learn_graph_control=list("euclidean_distance_ratio"=5, "geodesic_distance_ratio"=2, "minimal_branch_len"=1000))
    
load('data/cds_learn_graph.rdata')
p1 = plot_cells(cds,
           color_cells_by = "seurat_clusters",
           label_groups_by_cluster = FALSE,
           label_leaves = TRUE,
           label_branch_points = TRUE) +
  theme(panel.border = element_rect(fill=NA, color="black", size=1, linetype="solid"))
cds <- order_cells(cds)
load('data/cds_order_cells.rdata')
p2 = plot_cells(cds,
          color_cells_by = "pseudotime",
          label_cell_groups = F,
          label_leaves = F,
          label_branch_points = F,
          graph_label_size = 1.5) +
  theme(panel.border = element_rect(fill=NA, color="black", size=1, linetype="solid"))

# Add pseudotime to the Seurat object
pseudotime <- pseudotime(cds, reduction_method = 'UMAP')
load('data/seurat.obj.rdata')
pseudotime <- pseudotime[rownames(seurat.obj@meta.data)]
seurat.obj$Pseudotime <- pseudotime
p3 = FeaturePlot(seurat.obj, reduction = "umap", features = "Pseudotime")
plot_grid(p1, p2, p3, nrow=1)

# Step 4.2: smooth the expression of highly variable genes according to pseudotime
# Get expression profiles ordered by pseudotime
seurat_with_time = seurat.obj
expression_profile <- IReNA::get_SmoothByBin_PseudotimeExp(seurat_with_time, Bin = 50)
# Step 4.3: modularized highly variable genes through k-means of the smoothed expression
# Filter noise and logFC in expression profile
expression_profile_filter <- IReNA::filter_expression_profile(expression_profile, FC=0.01)
# K-means clustering
clustering <- IReNA::clustering_Kmeans(expression_profile_filter, K1=10)
clustering <- read.table('data/clustering_revised.txt', head=T)
Kmeans_clustering_ENS <- add_ENSID(clustering, Spec1='Mm')
# KEGG
library(org.Mm.eg.db)
keytypes(org.Mm.eg.db)
##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS"
##  [6] "ENTREZID"     "ENZYME"       "EVIDENCE"     "EVIDENCEALL"  "GENENAME"    
## [11] "GENETYPE"     "GO"           "GOALL"        "IPI"          "MGI"         
## [16] "ONTOLOGY"     "ONTOLOGYALL"  "PATH"         "PFAM"         "PMID"        
## [21] "PROSITE"      "REFSEQ"       "SYMBOL"       "UNIPROT"
library(KEGG.db)
enrichment_KEGG <- enrich_module(Kmeans_clustering_ENS, org.Mm.eg.db, enrich.db = 'KEGG', organism = 'mmu')
enrichment_KEGG[1:2,]
##                ID           Description module -log10(q-value) GeneRatio
## mmu04115 mmu04115 p53 signaling pathway      1        1.956574     7/104
## mmu04310 mmu04310 Wnt signaling pathway      1        1.454788     9/104
##           BgRatio       pvalue   p.adjust     qvalue
## mmu04115  70/6698 9.544578e-05 0.01221706 0.01105162
## mmu04310 154/6698 6.061403e-04 0.03879298 0.03509234
##                                                         geneID Count
## mmu04115            12444/12447/170770/12445/12122/75747/67874     7
## mmu04310 12444/20319/17869/57265/12445/72293/13380/18021/12301     9
# GO
enrichment_GO <- enrich_module(Kmeans_clustering_ENS, org.Mm.eg.db, 'GO', organism = 'mmu')
enrichment_GO[1:2,]
##                    ID                         Description module
## GO:0060537 GO:0060537           muscle tissue development      1
## GO:0042744 GO:0042744 hydrogen peroxide catabolic process      1
##            -log10(q-value) GeneRatio   BgRatio       pvalue     p.adjust
## GO:0060537        4.870024    22/284 494/28943 4.732070e-09 1.688876e-05
## GO:0042744        4.710916     7/284  30/28943 1.365178e-08 2.436160e-05
##                  qvalue
## GO:0060537 1.348889e-05
## GO:0042744 1.945738e-05
##                                                                                                                                             geneID
## GO:0060537 12444/108655/114142/57810/101401/16002/14609/17898/11819/17869/67040/22003/56449/228019/16011/13380/21388/14725/18021/14775/12301/75547
## GO:0042744                                                                                               15135/15126/15132/15122/69675/15129/14775
##            Count
## GO:0060537    22
## GO:0042744     7
Mm_func = enrichment_GO[order(enrichment_GO$module,decreasing = T),c(2,3,9)]
Mm_func$score = -log10(Mm_func$qvalue)
Mm_func$rank = 1:nrow(Mm_func)
Mm_func$module = factor(Mm_func$module)
col = c(colors <- c(rgb(174/255,199/255,232/255), rgb(103/255,193/255,227/255), rgb(91/255,166/255,218/255), rgb(0/255,114/255,189/255), rgb(253/255,209/255,176/255), rgb(239/255,153/255,81/255)))
plot_kmeans_pheatmap(clustering[order(clustering$KmeansGroup,decreasing = F),],
                    ModuleColor1 = c(colors <- col))

ggplot2::ggplot(Mm_func,aes(x=reorder(Description,rank),y=score))+
  geom_bar(position="dodge",stat="identity",aes(fill = module))+
  theme_classic()+ theme(panel.grid=element_blank(),text=element_text(size=15,face = "bold"))+
  xlab(NULL)+coord_flip() + ylab('-log10(qvalue)')+scale_fill_discrete(name = "module")+
  scale_fill_manual(values=col)

# Step 4.4: Identification of cell-enriched regulators
gtf <- read.delim("refer/mouse.genes.gtf", header=FALSE, comment.char="#")
source('R/cell_enriched_regulator.R')
source('R/CACIMAR_R_NewFunctions.R')
source('R/network_analysis.R')
# enriched regulator for cell one
cell.one_regulator_enrich <- regulator_enrich(wScReNI.weights[[1]], Kmeans_clustering_ENS, Tranfac201803_Mm_MotifTFsF, gtf, BSgenome.Mmusculus.UCSC.mm10, top_num_weig=3000)
# enriched regulator for each cell 
cell_enriched_regulator(wScReNI.weights, Kmeans_clustering_ENS, Tranfac201803_Mm_MotifTFsF, gtf, BSgenome.Mmusculus.UCSC.mm10, 3000)